No need to do this today because we are going to try something new. Please open the `class3-workbook.Rmd` from the `3-class3` folder you just downloaded. The default behavior of Rstudio is to consider the folder that contains a .Rmd document as the working directory. We are going to take advantage of this and see if this fixes our knitting and file path issues. All relative paths we will use today will be with respect to the `3-class3` folder that contains the .Rmd file.
data_gene_level <- read_csv("data/input/data_gene_level.csv")
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
## X1 = col_double(),
## time = col_character(),
## replicate = col_character(),
## ensembl_gene_id = col_character(),
## hgnc_symbol = col_character(),
## type = col_character(),
## count = col_double()
## )
rp_genes <- read_csv("data/input/rp_genes.csv")
## Parsed with column specification:
## cols(
## hgnc_symbol = col_character()
## )
target_genes <- read_csv("data/input/target_genes.csv")
## Parsed with column specification:
## cols(
## hgnc_symbol = col_character()
## )
# if-else clause within mutate
data <- data_gene_level %>%
mutate(target_status =
ifelse(hgnc_symbol %in% target_genes$hgnc_symbol, "target", "non-target"))
A popular approach to plotting in R is ggplot2. While graphs produced using ggplot2 defaults are (imo) not quite publication qualitym with a few tweaks you can pretty much get there! I find STHDA to be a great resource for ggplot2 and many other R coding questions.
ggplot()ggplot(): build plots piece by piece The concept of ggplot divides a plot into three different fundamental parts:
plot = data + aesthetics + geometry.
data: a data frame
aesthetics: specify x and y variables, and other features - color, size, shape, etc.
geometry: specify type of plots - histogram, boxplot, line, density, dotplot, bar, etc.
We will make a variety of box plots now. Please see this webpage for more information.
# initialize with data
ggplot(data) # specify which dataframe to use - no plot yet
# specify x and y axes
ggplot(data, aes(x = time, y = count)) # `data` is 'data', `x` is 'time', `y` is 'count'
# specify geometry
ggplot(data, aes(x = time, y = count)) +
geom_boxplot() # `geom` is 'geom_boxplot()'
# first method
ggplot(data, aes(x = time, y = log10(count))) +
geom_boxplot()
## Warning: Removed 1418 rows containing non-finite values (stat_boxplot).
# second method
ggplot(data, aes(x = time, y = count)) +
geom_boxplot() +
scale_y_log10() # make y-axis log10
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Removed 1418 rows containing non-finite values (stat_boxplot).
# add pseudocount of 10
ggplot(data, aes(x = time, y = count+10)) +
geom_boxplot() +
scale_y_log10() # make y-axis log10
data$time <- factor(data$time, levels = c("0h", "4h", "8h", "14h"))
# let's try the plot again...
ggplot(data, aes(x = time, y = count+10)) +
geom_boxplot() +
scale_y_log10() # make y-axis log10
ggplot(data, aes(x = time, y = count+10)) +
geom_boxplot() +
scale_y_log10() +
theme_few() # fewer lines
ggplot(data, aes(x = time, y = count+10)) +
geom_boxplot() +
scale_y_log10() +
theme_minimal() # Minimal
ggplot(data, aes(x = time, y = count+10)) +
geom_boxplot() +
scale_y_log10() +
theme_fivethirtyeight() # Nate Silver + Co.
ggplot(data, aes(x = time, y = count+10)) +
geom_boxplot() +
scale_y_log10() +
theme_few() +
ggtitle("DUX4 time course") # title
ggplot(data, aes(x = time, y = count+10)) +
geom_boxplot() +
scale_y_log10() +
theme_few() +
ggtitle("DUX4 time course") +
ylab(label = "RNA-seq read counts") + # my new y-axis name
xlab(label = "Hours of DUX4 expression") # my new x-axis name
ggplot(data, aes(x = time, y = count+10)) +
geom_point() +
scale_y_log10() +
theme_few() +
ggtitle("DUX4 time course") +
ylab(label = "RNA-seq read counts") +
xlab(label = "Hours of DUX4 expression")
# let's jitter the points
ggplot(data, aes(x = time, y = count+10)) +
geom_point(position = "jitter") +
scale_y_log10() +
theme_few() +
ggtitle("DUX4 time course") +
ylab(label = "RNA-seq read counts") +
xlab(label = "Hours of DUX4 expression")
# let's make the points more transparent
ggplot(data, aes(x = time, y = count+10)) +
geom_point(position = "jitter", alpha = 0.01) +
scale_y_log10() +
theme_few() +
ggtitle("DUX4 time course") +
ylab(label = "RNA-seq read counts") +
xlab(label = "Hours of DUX4 expression")
ggplot(data, aes(x = time, y = count+10)) +
geom_violin() +
scale_y_log10() +
theme_few() +
ggtitle("DUX4 time course") +
ylab(label = "RNA-seq read counts") +
xlab(label = "Hours of DUX4 expression")
# let's jitter the points
ggplot(data, aes(x = time, y = count+10)) +
geom_violin() +
geom_point(position = "jitter", alpha = 0.01) +
scale_y_log10() +
theme_few() +
ggtitle("DUX4 time course") +
ylab(label = "RNA-seq read counts") +
xlab(label = "Hours of DUX4 expression")
ggplot(data, aes(count+10)) + # only one variable specified
geom_histogram() + # histogram
scale_x_log10() +
theme_few() +
ggtitle("DUX4 time course") +
ylab(label = "Frequency") +
xlab(label = "log10(count+10")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# why can't I do this?
hist(log10(data$count+10))
# you can, while exploring data. ggplot is better to make figures since it gives you easy control of the aesthetics.
ggplot(data, aes(x = time, y = count+10)) +
geom_boxplot() +
scale_y_log10() +
theme_few() +
ggtitle("DUX4 time course") +
ylab(label = "RNA-seq read counts") +
xlab(label = "Hours of DUX4 expression")
ggplot(data, aes(x = time, y = count+10)) +
geom_boxplot(color = "black", fill = "red") + # changing the outline and fill color
scale_y_log10() +
theme_few() +
ggtitle("DUX4 time course") +
ylab(label = "RNA-seq read counts") +
xlab(label = "Hours of DUX4 expression")
ggplot(data, aes(x = time, y = count+10, fill = replicate)) +
geom_boxplot() +
scale_y_log10() +
theme_few() +
ggtitle("DUX4 time course") +
ylab(label = "RNA-seq read counts") +
xlab(label = "Hours of DUX4 expression")
ggplot(data, aes(x = replicate, y = count+10, fill = time)) +
geom_boxplot() +
scale_y_log10() +
theme_few() +
ggtitle("DUX4 time course") +
ylab(label = "RNA-seq read counts") +
xlab(label = "Hours of DUX4 expression")
ggplot(data, aes(x = target_status, y = count+10, fill = time)) +
geom_boxplot() +
scale_y_log10() +
theme_few() +
ggtitle("DUX4 time course") +
ylab(label = "RNA-seq read counts") +
xlab(label = "Hours of DUX4 expression")
ggplot(data, aes(x = target_status, y = count+10, fill = time)) +
geom_boxplot() +
scale_y_log10() +
theme_few() +
ggtitle("DUX4 time course") +
ylab(label = "RNA-seq read counts") +
xlab(label = "Hours of DUX4 expression") +
scale_fill_viridis(discrete = TRUE)
A LOT more! Before we go into that, let’s look under the hood of a ggplot object.
#instead of plotting directly, let's save our plot as an object
plot <- ggplot(data, aes(x = target_status, y = count+10, fill = time)) +
geom_boxplot() +
scale_y_log10() +
theme_few() +
ggtitle("DUX4 time course") +
ylab(label = "RNA-seq read counts") +
xlab(label = "Hours of DUX4 expression") +
scale_fill_viridis(discrete = TRUE)
plot # prints plot
names(plot) # what is in the object?
## [1] "data" "layers" "scales" "mapping" "theme"
## [6] "coordinates" "facet" "plot_env" "labels"
plot$data # i can get the data back
## # A tibble: 128,808 x 8
## X1 time replicate ensembl_gene_id hgnc_symbol type count target_status
## <dbl> <fct> <chr> <chr> <chr> <chr> <dbl> <chr>
## 1 1 0h rep1 ENSG00000000003 TSPAN6 rna 57 non-target
## 2 2 0h rep1 ENSG00000000419 DPM1 rna 196 non-target
## 3 3 0h rep1 ENSG00000000457 SCYL3 rna 30 non-target
## 4 4 0h rep1 ENSG00000000460 C1orf112 rna 95.0 non-target
## 5 5 0h rep1 ENSG00000000971 CFH rna 770 non-target
## 6 6 0h rep1 ENSG00000001036 FUCA2 rna 280 non-target
## 7 7 0h rep1 ENSG00000001084 GCLC rna 26 non-target
## 8 8 0h rep1 ENSG00000001167 NFYA rna 52 target
## 9 9 0h rep1 ENSG00000001461 NIPAL3 rna 42 non-target
## 10 10 0h rep1 ENSG00000001497 LAS1L rna 145 non-target
## # … with 128,798 more rows
plot$mapping # my aesthetics
## Aesthetic mapping:
## * `x` -> `target_status`
## * `y` -> `count + 10`
## * `fill` -> `time`
plot$labels # i see my labels
## $x
## [1] "Hours of DUX4 expression"
##
## $y
## [1] "RNA-seq read counts"
##
## $title
## [1] "DUX4 time course"
##
## $fill
## [1] "time"
plot$facet
## <ggproto object: Class FacetNull, Facet, gg>
## compute_layout: function
## draw_back: function
## draw_front: function
## draw_labels: function
## draw_panels: function
## finish_data: function
## init_scales: function
## map_data: function
## params: list
## setup_data: function
## setup_params: function
## shrink: TRUE
## train_scales: function
## vars: function
## super: <ggproto object: Class FacetNull, Facet, gg>
let’s say I want to make a separate p_target plot for each gene
plot + facet_wrap(~time) # `facet_wrap` time
plot + facet_wrap(~target_status) # `facet_wrap` target_status
plot + facet_wrap(~replicate) # `facet_wrap` replicate
# let's fix x-axis
plot + facet_wrap(~replicate) +
theme(axis.text.x = element_text(angle = 60, hjust = 1)) # angle 60 degree
plot1 <- plot + facet_wrap(~time)
plot2 <- plot + facet_wrap(~replicate) +
theme(axis.text.x = element_text(angle = 60, hjust = 1)) # angle 60 degree
# Now we make a figure!
plot_final <- plot_grid(plot1, plot2, labels = c("A","B"), nrow = 2)
plot_final
save_plot(filename = "images/dux4_time_course_target.png", plot = plot1, base_width = 6, base_height = 4)
save_plot(filename = "images/dux4_time_course_replicate.png", plot = plot2, base_width = 6, base_height = 4)
save_plot(filename = "images/figure2.png", plot = plot_final, base_width = 8, base_height = 8)
sessionInfo()
## R version 3.6.2 (2019-12-12)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS High Sierra 10.13.4
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] viridis_0.5.1 viridisLite_0.3.0 cowplot_1.0.0 scales_1.1.0
## [5] ggthemes_4.2.0 forcats_0.4.0 stringr_1.4.0 dplyr_0.8.3
## [9] purrr_0.3.3 readr_1.3.1 tidyr_1.0.0 tibble_2.1.3
## [13] ggplot2_3.2.1 tidyverse_1.3.0
##
## loaded via a namespace (and not attached):
## [1] tidyselect_0.2.5 xfun_0.11 haven_2.2.0 lattice_0.20-38
## [5] colorspace_1.4-1 vctrs_0.2.1 generics_0.0.2 htmltools_0.4.0
## [9] yaml_2.2.0 utf8_1.1.4 rlang_0.4.2 pillar_1.4.3
## [13] withr_2.1.2 glue_1.3.1 DBI_1.1.0 dbplyr_1.4.2
## [17] modelr_0.1.5 readxl_1.3.1 lifecycle_0.1.0 munsell_0.5.0
## [21] gtable_0.3.0 cellranger_1.1.0 rvest_0.3.5 evaluate_0.14
## [25] labeling_0.3 knitr_1.26 fansi_0.4.0 broom_0.5.3
## [29] Rcpp_1.0.3 backports_1.1.5 jsonlite_1.6 farver_2.0.1
## [33] fs_1.3.1 gridExtra_2.3 hms_0.5.2 digest_0.6.23
## [37] stringi_1.4.3 grid_3.6.2 cli_2.0.0 tools_3.6.2
## [41] magrittr_1.5 lazyeval_0.2.2 crayon_1.3.4 pkgconfig_2.0.3
## [45] zeallot_0.1.0 xml2_1.2.2 reprex_0.3.0 lubridate_1.7.4
## [49] assertthat_0.2.1 rmarkdown_2.0 httr_1.4.1 rstudioapi_0.10
## [53] R6_2.4.1 nlme_3.1-142 compiler_3.6.2